import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import random
import datetime
import scipy.stats as stats
from pandas import set_option
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from sklearn.cluster import KMeans
from sklearn import metrics
from imblearn.over_sampling import SMOTE, RandomOverSampler, BorderlineSMOTE
from collections import Counter
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_curve, roc_auc_score, fbeta_score, make_scorer, recall_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_validate, cross_val_score, KFold, StratifiedKFold, RepeatedKFold, LeaveOneOut
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier,RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
# 1A
# Load the sensor "signal" dataset (Time column, ~590 numeric sensor readings,
# and a Pass/Fail label).  The trailing bare `data` displays it in the notebook.
path = 'signal-data.csv'
data = pd.read_csv(path)
data
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 3030.93 | 2564.00 | 2187.7333 | 1411.1265 | 1.3602 | 100.0 | 97.6133 | 0.1242 | 1.5005 | ... | NaN | 0.5005 | 0.0118 | 0.0035 | 2.3630 | NaN | NaN | NaN | NaN | -1 |
| 1 | 2008-07-19 12:32:00 | 3095.78 | 2465.14 | 2230.4222 | 1463.6606 | 0.8294 | 100.0 | 102.3433 | 0.1247 | 1.4966 | ... | 208.2045 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.0096 | 0.0201 | 0.0060 | 208.2045 | -1 |
| 2 | 2008-07-19 13:17:00 | 2932.61 | 2559.94 | 2186.4111 | 1698.0172 | 1.5102 | 100.0 | 95.4878 | 0.1241 | 1.4436 | ... | 82.8602 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.0584 | 0.0484 | 0.0148 | 82.8602 | 1 |
| 3 | 2008-07-19 14:43:00 | 2988.72 | 2479.90 | 2199.0333 | 909.7926 | 1.3204 | 100.0 | 104.2367 | 0.1217 | 1.4882 | ... | 73.8432 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| 4 | 2008-07-19 15:22:00 | 3032.24 | 2502.87 | 2233.3667 | 1326.5200 | 1.5334 | 100.0 | 100.3967 | 0.1235 | 1.5031 | ... | NaN | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1562 | 2008-10-16 15:13:00 | 2899.41 | 2464.36 | 2179.7333 | 3085.3781 | 1.4843 | 100.0 | 82.2467 | 0.1248 | 1.3424 | ... | 203.1720 | 0.4988 | 0.0143 | 0.0039 | 2.8669 | 0.0068 | 0.0138 | 0.0047 | 203.1720 | -1 |
| 1563 | 2008-10-16 20:49:00 | 3052.31 | 2522.55 | 2198.5667 | 1124.6595 | 0.8763 | 100.0 | 98.4689 | 0.1205 | 1.4333 | ... | NaN | 0.4975 | 0.0131 | 0.0036 | 2.6238 | 0.0068 | 0.0138 | 0.0047 | 203.1720 | -1 |
| 1564 | 2008-10-17 05:26:00 | 2978.81 | 2379.78 | 2206.3000 | 1110.4967 | 0.8236 | 100.0 | 99.4122 | 0.1208 | NaN | ... | 43.5231 | 0.4987 | 0.0153 | 0.0041 | 3.0590 | 0.0197 | 0.0086 | 0.0025 | 43.5231 | -1 |
| 1565 | 2008-10-17 06:01:00 | 2894.92 | 2532.01 | 2177.0333 | 1183.7287 | 1.5726 | 100.0 | 98.7978 | 0.1213 | 1.4622 | ... | 93.4941 | 0.5004 | 0.0178 | 0.0038 | 3.5662 | 0.0262 | 0.0245 | 0.0075 | 93.4941 | -1 |
| 1566 | 2008-10-17 06:07:00 | 2944.92 | 2450.76 | 2195.4444 | 2914.1792 | 1.5978 | 100.0 | 85.1011 | 0.1235 | NaN | ... | 137.7844 | 0.4987 | 0.0181 | 0.0040 | 3.6275 | 0.0117 | 0.0162 | 0.0045 | 137.7844 | -1 |
1567 rows × 592 columns
# 1
# Summary statistics (count/mean/std/quantiles) for every numeric column.
data.describe()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1561.000000 | 1560.000000 | 1553.000000 | 1553.000000 | 1553.000000 | 1553.0 | 1553.000000 | 1558.000000 | 1565.000000 | 1565.000000 | ... | 618.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1567.000000 |
| mean | 3014.452896 | 2495.850231 | 2200.547318 | 1396.376627 | 4.197013 | 100.0 | 101.112908 | 0.121822 | 1.462862 | -0.000841 | ... | 97.934373 | 0.500096 | 0.015318 | 0.003847 | 3.067826 | 0.021458 | 0.016475 | 0.005283 | 99.670066 | -0.867262 |
| std | 73.621787 | 80.407705 | 29.513152 | 441.691640 | 56.355540 | 0.0 | 6.237214 | 0.008961 | 0.073897 | 0.015116 | ... | 87.520966 | 0.003404 | 0.017180 | 0.003720 | 3.578033 | 0.012358 | 0.008808 | 0.002867 | 93.891919 | 0.498010 |
| min | 2743.240000 | 2158.750000 | 2060.660000 | 0.000000 | 0.681500 | 100.0 | 82.131100 | 0.000000 | 1.191000 | -0.053400 | ... | 0.000000 | 0.477800 | 0.006000 | 0.001700 | 1.197500 | -0.016900 | 0.003200 | 0.001000 | 0.000000 | -1.000000 |
| 25% | 2966.260000 | 2452.247500 | 2181.044400 | 1081.875800 | 1.017700 | 100.0 | 97.920000 | 0.121100 | 1.411200 | -0.010800 | ... | 46.184900 | 0.497900 | 0.011600 | 0.003100 | 2.306500 | 0.013425 | 0.010600 | 0.003300 | 44.368600 | -1.000000 |
| 50% | 3011.490000 | 2499.405000 | 2201.066700 | 1285.214400 | 1.316800 | 100.0 | 101.512200 | 0.122400 | 1.461600 | -0.001300 | ... | 72.288900 | 0.500200 | 0.013800 | 0.003600 | 2.757650 | 0.020500 | 0.014800 | 0.004600 | 71.900500 | -1.000000 |
| 75% | 3056.650000 | 2538.822500 | 2218.055500 | 1591.223500 | 1.525700 | 100.0 | 104.586700 | 0.123800 | 1.516900 | 0.008400 | ... | 116.539150 | 0.502375 | 0.016500 | 0.004100 | 3.295175 | 0.027600 | 0.020300 | 0.006400 | 114.749700 | -1.000000 |
| max | 3356.350000 | 2846.440000 | 2315.266700 | 3715.041700 | 1114.536600 | 100.0 | 129.252200 | 0.128600 | 1.656400 | 0.074900 | ... | 737.304800 | 0.509800 | 0.476600 | 0.104500 | 99.303200 | 0.102800 | 0.079900 | 0.028600 | 737.304800 | 1.000000 |
8 rows × 591 columns
There is large variation in the mean values between features.
The total number of entries vary across features indicating the presence of null/NaN values.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 592 entries, Time to Pass/Fail dtypes: float64(590), int64(1), object(1) memory usage: 7.1+ MB
The original dataset is kept aside (as `data`) for a final comparison of model performance between the original and the modified data.
# Drop the timestamp column (not used as a predictor); `data` itself is kept
# untouched for the final comparison mentioned above.
data1 = data.drop('Time',axis=1)
data1['Pass/Fail'].unique()
array([-1, 1], dtype=int64)
# Re-encode the target: original 1 -> 0, original -1 -> 1.
# NOTE(review): the order matters — mapping 1 first avoids clobbering the
# values produced by the second replace.  Presumably -1 meant "pass" in the
# raw data; confirm against the dataset documentation.
data1['Pass/Fail'] = data1['Pass/Fail'].replace(to_replace=1,value=0)
data1['Pass/Fail'] = data1['Pass/Fail'].replace(to_replace=-1,value=1)
Changing datatype of target column as category
data1['Pass/Fail'] = data1['Pass/Fail'].astype('category')
# 2 A
# True if any NaN exists anywhere in the frame.
data1.isnull().any().any()
True
Creating a for loop to remove all the features with 20%+ Null values
def remove_null(df, thres):
    """Drop columns whose fraction of null values is at least *thres*.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; not modified in place.
    thres : float
        Fraction in [0, 1]; a column whose null ratio is >= thres is dropped.

    Returns
    -------
    pd.DataFrame
        Copy of *df* without the high-null columns.
    """
    # Vectorized per-column null ratio (replaces the per-column Python loop).
    null_ratio = df.isna().sum() / df.shape[0]
    cols_remove = null_ratio[null_ratio >= thres].index.tolist()
    # Report the actual threshold instead of a hard-coded "20%".
    print('Number of features removed with more than {:.0%} of null values \t:'.format(thres),
          len(cols_remove))
    return df.drop(labels=cols_remove, axis=1)
Removing features having more than 20% null values
data2 = remove_null(data1,0.2)
Number of features removed with more than 20% of null values : 32
-A function is created for imputing NaN entries with the mean of the corresponding feature, and it is applied to the split data instead of the whole dataset.
-If this function were applied before splitting, it could cause data leakage: the mean would be computed from all datapoints, exposing test data. Hence it is applied after the split.
def imputer(df, fit_df=None):
    """Mean-impute the NaN entries of *df*.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose NaNs are replaced (all-numeric).
    fit_df : pd.DataFrame, optional
        Frame used to compute the column means.  Defaults to *df* itself
        (the original behaviour).  Pass the training set here when imputing
        a test set, so test-set statistics never drive the imputation.

    Returns
    -------
    pd.DataFrame
        Imputed float64 copy of *df* with a fresh RangeIndex (matching the
        original SimpleImputer-based behaviour).
    """
    source = fit_df if fit_df is not None else df
    # Column means computed on `source`; equivalent to SimpleImputer(strategy='mean').
    means = source.mean()
    out = df.fillna(means).astype('float64')
    # SimpleImputer returned a plain array, so the original lost the index.
    out.index = pd.RangeIndex(len(out))
    return out
# Q2B
-The following function removes features that hold the same value in every row. Such columns are identified by computing the standard deviation.
-Columns with a standard deviation of 0 are removed.
def remove_duplicates(df):
    """Drop constant columns (every row holds the same value).

    A column is constant exactly when its standard deviation is 0
    (NaNs are skipped by ``DataFrame.std``).

    Parameters
    ----------
    df : pd.DataFrame

    Returns
    -------
    pd.DataFrame
        Copy of *df* without the constant columns.
    """
    df_std = df.std()
    constant_features = df_std[df_std == 0].index
    # '\t' restores the tab that was lost in the original message ("...values t:").
    print('Number of features removed with same row values \t:',
          len(constant_features))
    return df.drop(labels=constant_features, axis=1)
data3 = remove_duplicates(data2)
Number of features removed with same row values t: 116
# 2 C
Feature engineering steps to extract useful features:-
-Since the 'Time' column data is not useful in the context of the problem, it is dropped or removed in previous steps.
-Features having more than 20% null values are also removed.
-Features with same value in rows are identified and removed.
-Columns with 0 standard deviation are removed.
-Some other features are also removed based on their correlation using correlation function and Variance inflation
Factor(VIF).
-Along all these XGboost classifier is also used to identify and extract important features.
Now, splitting the data before further processing
Data is split at this stage itself to prevent data leakage between training and test sets.
# 4 A (segregation)
# Separate the independent features (X) from the target (y).
X = data3.drop(labels='Pass/Fail',axis=1)
y = data3['Pass/Fail']
Adding a prefix 'f' to the column names for easy identification.
X = X.add_prefix('f')
# 4 C (train_test_split)
Using a train-test split of 80%-20%.
The split is stratified to maintain the same dependent class distribution for train and test data.
# 80/20 stratified split so both sets keep the target class ratio;
# random_state pins the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.20,random_state=1)
print(X_train.shape)
print(X_test.shape)
(1253, 442) (314, 442)
The NaN entries in the train and test sets are imputed by mean of the corresponding feature. This is done as a part of 2A
# Mean-impute the remaining NaNs, train and test separately.
# NOTE(review): each set is imputed with its own column means; imputing the
# test set with the *training* means would be the leakage-safe choice — confirm intent.
X_train1 = imputer(X_train)
X_test1 = imputer(X_test)
Outliers Treatment
def outliers(df):
    """Replace IQR outliers in every column with that column's median.

    For each column, values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are
    replaced by the column median.  Boundaries and the median are computed
    once from the original values, so results do not depend on the order
    in which outliers are visited (the original recomputed the median
    after each in-place replacement).

    Parameters
    ----------
    df : pd.DataFrame
        All-numeric frame; a copy is modified and returned (the caller's
        frame is left untouched, unlike the original in-place version).

    Returns
    -------
    pd.DataFrame
    """
    df = df.copy()
    for col in df.columns:
        q1 = df[col].quantile(q=0.25)
        q3 = df[col].quantile(q=0.75)
        iqr = q3 - q1
        upper_boundary = q3 + 1.5 * iqr
        lower_boundary = q1 - 1.5 * iqr
        median = df[col].median()  # computed before any replacement
        mask = (df[col] > upper_boundary) | (df[col] < lower_boundary)
        # .loc avoids the chained assignment (df[j][i] = ...) of the original,
        # which is unreliable and removed in pandas 3.
        df.loc[mask, col] = median
    return df
# Median-replace IQR outliers; indexes are reset so positional access works.
# NOTE(review): test-set boundaries come from the test set itself — verify
# that using the training boundaries was not intended.
X_train2 = outliers(X_train1.reset_index(drop=True))
X_test2 = outliers(X_test1.reset_index(drop=True))
Outliers that exist beyond the IQR boundaries are replaced with the median.
# 2 D
Presence of Multi-collinearity can be ascertained by:
i) Checking for correlated independent features using correlation matrix. The threshold is selected as 0.80.
ii)Checking for Variance Inflation Factor (VIF) of each independent feature. Features with VIF>10 are removed.
# i) Creating a function to identify correlation among the features and using this function to remove features having correlation greater than 80%
def remove_corr(df, threshold):
    """Drop features so that no remaining pair has |corr| > *threshold*.

    The original scanned the full (symmetric) correlation matrix and added
    BOTH members of every correlated pair to the drop set, discarding all
    information carried by a correlated group.  This version scans only the
    upper triangle and drops one member per pair, keeping a representative.

    Parameters
    ----------
    df : pd.DataFrame
        All-numeric frame.
    threshold : float
        Absolute-correlation cutoff, e.g. 0.80.

    Returns
    -------
    pd.DataFrame
        Copy of *df* with one member of each over-correlated pair removed.
    """
    corr = df.corr().abs()
    # Keep only entries strictly above the diagonal: each pair appears once.
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [c for c in upper.columns if (upper[c] > threshold).any()]
    print('Number of features removed = ', len(to_drop))
    return df.drop(to_drop, axis=1)
# Removing features that have correlation greater than 80% by using the above function
X_train3 = remove_corr(X_train2, 0.80)
| 0 | 1 | 2 | |
|---|---|---|---|
| 0 | f11 | f147 | 0.905348 |
| 1 | f12 | f282 | 0.906995 |
| 2 | f17 | f420 | 0.909175 |
| 3 | f18 | f18 | 0.979013 |
| 4 | f21 | f153 | 0.877337 |
| ... | ... | ... | ... |
| 306 | f583 | f584 | 0.81529 |
| 307 | f584 | f585 | 0.995825 |
| 308 | f585 | f583 | 0.81529 |
| 309 | f587 | f588 | 0.832242 |
| 310 | f588 | f587 | 0.832242 |
311 rows × 3 columns
Number of features removed = 311
X_test3 = X_test2[X_train3.columns]
ii)Checking multicollinearity with Variance Inflation Factor (VIF)
def remove_high_VIF(df, limit):
    """Drop every feature whose variance inflation factor exceeds *limit*.

    VIF is computed per column with statsmodels'
    ``variance_inflation_factor`` over the raw value matrix.

    Parameters
    ----------
    df : pd.DataFrame
        All-numeric frame.
    limit : float
        Features with VIF strictly above this value are removed (10 is a
        common multicollinearity cutoff).

    Returns
    -------
    pd.DataFrame
        Copy of *df* without the high-VIF columns.
    """
    matrix = df.values
    scores = pd.DataFrame({
        'features': df.columns,
        'VIF': [VIF(matrix, idx) for idx in range(df.shape[1])],
    })
    offenders = scores.loc[scores['VIF'] > limit, 'features']
    print('Number of features with VIF > ', limit, '= ', len(offenders))
    return df.drop(offenders, axis=1)
# Drop features with VIF > 10, then align the test set to the survivors.
X_train4 = remove_high_VIF(X_train3,10)
X_test4 = X_test3[X_train4.columns]
Number of features with VIF > 10 = 82
# 2 E
Modifications made on data (or) steps in cleaning data so far :-
-Dropping a column(Time) that is not useful in the context of problem.
-Changing the datatype of target column as 'category'.
-Removing features with more than 20% null values.
-Splitting data(To prevent data leakage).
-Imputing remaining null values with mean using imputer.
-Outliers Treatment.
-Removing columns with rows having same value.
-Identifying columns with '0' standard deviation and removing them.
-Cleaning data based on multicollinearity: i) Correlation ii) Variance Inflation Factor (VIF)
-Removing features based on variance(threshold).
-As we are dealing with a number of features(even after preprocessing), where both important and less useful features are
present we have done XGBoost to get important features by doing feature importance to train models.
Above steps were done to prepare data for training models on various algorithms.
def remove_variance(df, thres):
    """Return the names of features whose variance is below thres*(1-thres).

    The original instantiated sklearn's ``VarianceThreshold`` only to read
    its ``variances_`` attribute (its constructor threshold never drove the
    filter).  ``variances_`` is the population variance (ddof=0), so
    ``df.var(ddof=0)`` reproduces it exactly without the sklearn round-trip.

    Parameters
    ----------
    df : pd.DataFrame
        All-numeric frame.
    thres : float
        Bernoulli-style parameter; the cutoff is thres * (1 - thres).

    Returns
    -------
    pd.Series
        Names of the low-variance features (to be passed to ``df.drop``).
    """
    set_option('display.precision', 3)  # kept: the original set this here too
    cutoff = thres * (1 - thres)
    variances = df.var(ddof=0)  # ddof=0 matches VarianceThreshold.variances_
    dummy = pd.Series(variances.index[variances < cutoff])
    print('Number of features removed: ', len(dummy))
    return dummy
# Drop low-variance features (cutoff 0.8 * (1 - 0.8) = 0.16) and align the test set.
X_train5 = X_train4.drop(remove_variance(X_train4,0.80),axis=1)
X_test5 = X_test4[X_train5.columns]
Number of features removed: 27
# Shapes after variance filtering (22 features remain).
print(X_train5.shape)
print(X_test5.shape)
(1253, 22) (314, 22)
As we cannot do analysis on hundreds of features it is better to take out important features for training and this is done by using XGBoost Classifier
Selection of best features using XGboost
def imp_features(X, y, thres=0.01):
    """Select features whose XGBoost importance exceeds *thres*.

    Fits an ``XGBClassifier`` on (X, y), displays the full importance table
    sorted descending, and returns the names of the features whose
    importance is strictly greater than *thres*.

    Parameters
    ----------
    X : pd.DataFrame
        Training features.
    y : array-like
        Training target.
    thres : float, optional
        Importance cutoff; defaults to 0.01 (the value previously hard-coded).

    Returns
    -------
    pd.Series
        Names of the selected features.
    """
    model = XGBClassifier(random_state=1)
    model.fit(X, y)
    fi_df = pd.DataFrame({'Name': pd.Series(model.feature_names_in_),
                          'FI': pd.Series(model.feature_importances_)})
    selected = fi_df.loc[fi_df['FI'] > thres, 'Name']
    display(fi_df.sort_values(by='FI', ascending=False))
    # The original message claimed "greater than 0" while actually filtering
    # at 0.01; report the real threshold.
    print('Number of features with feature importance greater than', thres, ':',
          len(selected))
    return selected
imp_features = imp_features(X_train5,y_train)
| Name | FI | |
|---|---|---|
| 1 | f59 | 0.071 |
| 3 | f412 | 0.065 |
| 14 | f486 | 0.062 |
| 18 | f499 | 0.059 |
| 13 | f485 | 0.056 |
| 9 | f480 | 0.054 |
| 2 | f129 | 0.053 |
| 16 | f488 | 0.049 |
| 0 | f24 | 0.049 |
| 4 | f418 | 0.044 |
| 20 | f511 | 0.044 |
| 6 | f432 | 0.043 |
| 17 | f489 | 0.041 |
| 21 | f589 | 0.041 |
| 15 | f487 | 0.040 |
| 7 | f433 | 0.040 |
| 8 | f468 | 0.037 |
| 12 | f484 | 0.035 |
| 11 | f483 | 0.031 |
| 19 | f500 | 0.030 |
| 10 | f482 | 0.028 |
| 5 | f419 | 0.027 |
Number of features with feature importance greater than 0 : 22
# Keep only the XGBoost-selected features in both sets.
X_train6 = X_train5[imp_features]
X_test6 = X_test5[imp_features]
# 3 A
# Univariate analysis: histograms and density plots of the selected features.
X_train6.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1,figsize=(12,12),grid=False)
plt.show()
X_train6.plot(kind='density', subplots=True, layout=(7,4), sharex=False, legend=False,
fontsize=1, figsize=(12,12))
plt.show()
Comments on univariate analysis:
Number of features are reduced to 22 making it easier for analysis.
The histogram and density plots indicate that most of the features are skewed towards right indicating that majority of data values are low and the presence of larger values.
Also some features display two peaks(clusters).
# 3 B
# Bivariate Analysis
For any two random features
sns.jointplot(data=X_train, x='f59', y='f412');
For all features
# One bar per selected feature (seaborn aggregates each wide-form column).
plt.figure(figsize=(12,8))
sns.barplot(data = X_train6);
Comments on bivariate Analysis:-
The correlation of each feature with the others differs.
Features are not strongly related.
# Multivariate Analysis
# Correlation heatmap of the selected features.
fig, ax = plt.subplots(figsize=(18,18))
sns.heatmap(X_train6.corr(), annot=True);
# Pairwise scatter plots with KDE diagonals.
# NOTE(review): pairplot creates its own figure, so this plt.figure call is
# unused — hence the empty "4000x4000" figure in the output below.
plt.figure(figsize=(20,20),dpi=200)
sns.pairplot(X_train6,kind='scatter',diag_kind='kde');
<Figure size 4000x4000 with 0 Axes>